#include "zMatrixLapack.hpp"
#include "zMatrixConstant.hpp"
//use lapack to do high level math

#include <3rdParty/BlasLapack.hpp>
#include <Math/Math.hpp>

namespace zzz{
#ifdef ZZZ_LIB_LAPACK
bool LUFactorization(zMatrix<double,zColMajor> &mat, zMatrix<integer,zColMajor> &IPIV, integer *info)
{
  /*
  *  M       (input) INTEGER
  *          The number of rows of the matrix A.  M >= 0.
  *
  *  N       (input) INTEGER
  *          The number of columns of the matrix A.  N >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the M-by-N matrix to be factored.
  *          On exit, the factors L and U from the factorization
  *          A = P*L*U; the unit diagonal elements of L are not stored.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,M).
  *
  *  IPIV    (output) INTEGER array, dimension (min(M,N))
  *          The pivot indices; for 1 <= i <= min(M,N), row i of the
  *          matrix was interchanged with row IPIV(i).
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value
  *          > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
  *                has been completed, but the factor U is exactly
  *                singular, and division by zero will occur if it is used
  *                to solve a system of equations.
  */
  integer M=mat.Size(0),N=mat.Size(1),LDA=M,INFO;
  IPIV.SetSize(Min<int>(M,N));
  dgetrf_(&M,&N,mat.Data(),&LDA,IPIV.Data(),&INFO);
  if (info!=NULL) *info=INFO;
  return INFO==0;
}


//ATTENTION: original value will not be kept even if failed
bool Invert(zMatrix<double,zColMajor> &mat, integer *info)
{
  zMatrix<integer,zColMajor> IPIV;
  if (!LUFactorization(mat,IPIV,info)) return false;
  /*
  *  N       (input) INTEGER
  *          The order of the matrix A.  N >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the factors L and U from the factorization
  *          A = P*L*U as computed by DGETRF.
  *          On exit, if INFO = 0, the inverse of the original matrix A.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,N).
  *
  *  IPIV    (input) INTEGER array, dimension (N)
  *          The pivot indices from DGETRF; for 1<=i<=N, row i of the
  *          matrix was interchanged with row IPIV(i).
  *
  *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  *          On exit, if INFO=0, then WORK(1) returns the optimal LWORK.
  *
  *  LWORK   (input) INTEGER
  *          The dimension of the array WORK.  LWORK >= max(1,N).
  *          For optimal performance LWORK >= N*NB, where NB is
  *          the optimal blocksize returned by ILAENV.
  *
  *          If LWORK = -1, then a workspace query is assumed; the routine
  *          only calculates the optimal size of the WORK array, returns
  *          this value as the first entry of the WORK array, and no error
  *          message related to LWORK is issued by XERBLA.
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value
  *          > 0:  if INFO = i, U(i,i) is exactly zero; the matrix is
  *                singular and its inverse could not be computed.
  */
  ZCHECK_EQ(mat.Size(0), mat.Size(1));
  integer N=mat.Size(0);
  integer LDA=N;

  //ilaenv is not compiled in my clapack lib
  //  integer ISPEC=1;
  //  integer N1=N;
  //  integer N2=-1;
  //  integer N3=-1;
  //  integer N4=-1;
  //  integer NB=ilaenv_(&ISPEC,"DGETRI","",&N1,&N2,&N3,&N4,6,0);
  //  printf("%d\n",NB);
  //  integer LWORK=N*NB;

  integer LWORK=N;
  zMatrix<double,zColMajor> WORK(LWORK);
  integer INFO;

  dgetri_(&N, mat.Data(), &LDA, IPIV.Data(), WORK.Data(), &LWORK, &INFO);

  if (info!=NULL) *info=INFO;
  return INFO==0;
}

bool SVD(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &U, zMatrix<double,zColMajor> &S, zMatrix<double,zColMajor> &VT, integer *info)
{
  /*
  *  JOBU    (input) CHARACTER*1
  *          Specifies options for computing all or part of the matrix U:
  *          = 'A':  all M columns of U are returned in array U:
  *          = 'S':  the first min(m,n) columns of U (the left singular
  *                  vectors) are returned in the array U;
  *          = 'O':  the first min(m,n) columns of U (the left singular
  *                  vectors) are overwritten on the array A;
  *          = 'N':  no columns of U (no left singular vectors) are
  *                  computed.
  *
  *  JOBVT   (input) CHARACTER*1
  *          Specifies options for computing all or part of the matrix
  *          V**T:
  *          = 'A':  all N rows of V**T are returned in the array VT;
  *          = 'S':  the first min(m,n) rows of V**T (the right singular
  *                  vectors) are returned in the array VT;
  *          = 'O':  the first min(m,n) rows of V**T (the right singular
  *                  vectors) are overwritten on the array A;
  *          = 'N':  no rows of V**T (no right singular vectors) are
  *                  computed.
  *
  *          JOBVT and JOBU cannot both be 'O'.
  *
  *  M       (input) INTEGER
  *          The number of rows of the input matrix A.  M >= 0.
  *
  *  N       (input) INTEGER
  *          The number of columns of the input matrix A.  N >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the M-by-N matrix A.
  *          On exit,
  *          if JOBU = 'O',  A is overwritten with the first min(m,n)
  *                          columns of U (the left singular vectors,
  *                          stored columnwise);
  *          if JOBVT = 'O', A is overwritten with the first min(m,n)
  *                          rows of V**T (the right singular vectors,
  *                          stored rowwise);
  *          if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A
  *                          are destroyed.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,M).
  *
  *  S       (output) DOUBLE PRECISION array, dimension (min(M,N))
  *          The singular values of A, sorted so that S(i) >= S(i+1).
  *
  *  U       (output) DOUBLE PRECISION array, dimension (LDU,UCOL)
  *          (LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'.
  *          If JOBU = 'A', U contains the M-by-M orthogonal matrix U;
  *          if JOBU = 'S', U contains the first min(m,n) columns of U
  *          (the left singular vectors, stored columnwise);
  *          if JOBU = 'N' or 'O', U is not referenced.
  *
  *  LDU     (input) INTEGER
  *          The leading dimension of the array U.  LDU >= 1; if
  *          JOBU = 'S' or 'A', LDU >= M.
  *
  *  VT      (output) DOUBLE PRECISION array, dimension (LDVT,N)
  *          If JOBVT = 'A', VT contains the N-by-N orthogonal matrix
  *          V**T;
  *          if JOBVT = 'S', VT contains the first min(m,n) rows of
  *          V**T (the right singular vectors, stored rowwise);
  *          if JOBVT = 'N' or 'O', VT is not referenced.
  *
  *  LDVT    (input) INTEGER
  *          The leading dimension of the array VT.  LDVT >= 1; if
  *          JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N).
  *
  *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
  *          if INFO > 0, WORK(2:MIN(M,N)) contains the unconverged
  *          superdiagonal elements of an upper bidiagonal matrix B
  *          whose diagonal is in S (not necessarily sorted). B
  *          satisfies A = U * B * VT, so it has the same singular values
  *          as A, and singular vectors related by U and VT.
  *
  *  LWORK   (input) INTEGER
  *          The dimension of the array WORK.
  *          LWORK >= MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)).
  *          For good performance, LWORK should generally be larger.
  *
  *          If LWORK = -1, then a workspace query is assumed; the routine
  *          only calculates the optimal size of the WORK array, returns
  *          this value as the first entry of the WORK array, and no error
  *          message related to LWORK is issued by XERBLA.
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit.
  *          < 0:  if INFO = -i, the i-th argument had an illegal value.
  *          > 0:  if DBDSQR did not converge, INFO specifies how many
  *                superdiagonals of an intermediate bidiagonal form B
  *                did not converge to zero. See the description of WORK
  *                above for details.
  */
  integer M = A.Size(0), N = A.Size(1), INFO;
  integer LDA = M, LDU = M, LDVT = N;
  integer minMN = Min<integer>(M,N);
  integer LWORK = Max<integer>(3*minMN+Max<integer>(M,N), 5*minMN);
  zMatrix<double> WORK(LWORK,1);
  U.SetSize(LDU, M); 
  VT.SetSize(LDVT, N);
  S.SetSize(minMN);
  dgesvd_("A", "A", &M, &N, A.Data(), &LDA, S.Data(), U.Data(), &LDU,
    VT.Data(), &LDVT, WORK.Data(), &LWORK, &INFO);
  if (info!=NULL) *info=INFO;
  return INFO==0;
}

double Det(zMatrix<double,zColMajor> &A)
{
  ZCHECK_EQ(A.Size(0), A.Size(1));
  zMatrix<integer> IPIV;
  LUFactorization(A, IPIV);
  double res = 1.0f;
  for(zuint i=0; i<A.Size(0); i++)
    res *= A(i,i);
  for(zuint i=0; i<IPIV.size(); i++)
    if (i!=IPIV[i]-1) res *= -1.0f;
  return res;
};


bool EigenSym(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &e, integer *info)
{
  /*
  *  JOBZ    (input) CHARACTER*1
  *          = 'N':  Compute eigenvalues only;
  *          = 'V':  Compute eigenvalues and eigenvectors.
  *
  *  UPLO    (input) CHARACTER*1
  *          = 'U':  Upper triangle of A is stored;
  *          = 'L':  Lower triangle of A is stored.
  *
  *  N       (input) INTEGER
  *          The order of the matrix A.  N >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA, N)
  *          On entry, the symmetric matrix A.  If UPLO = 'U', the
  *          leading N-by-N upper triangular part of A contains the
  *          upper triangular part of the matrix A.  If UPLO = 'L',
  *          the leading N-by-N lower triangular part of A contains
  *          the lower triangular part of the matrix A.
  *          On exit, if JOBZ = 'V', then if INFO = 0, A contains the
  *          orthonormal eigenvectors of the matrix A.
  *          If JOBZ = 'N', then on exit the lower triangle (if UPLO='L')
  *          or the upper triangle (if UPLO='U') of A, including the
  *          diagonal, is destroyed.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,N).
  *
  *  W       (output) DOUBLE PRECISION array, dimension (N)
  *          If INFO = 0, the eigenvalues in ascending order.
  *
  *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
  *
  *  LWORK   (input) INTEGER
  *          The length of the array WORK.  LWORK >= max(1,3*N-1).
  *          For optimal efficiency, LWORK >= (NB+2)*N,
  *          where NB is the blocksize for DSYTRD returned by ILAENV.
  *
  *          If LWORK = -1, then a workspace query is assumed; the routine
  *          only calculates the optimal size of the WORK array, returns
  *          this value as the first entry of the WORK array, and no error
  *          message related to LWORK is issued by XERBLA.
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value
  *          > 0:  if INFO = i, the algorithm failed to converge; i
  *                off-diagonal elements of an intermediate tridiagonal
  *                form did not converge to zero.
  */
  ZCHECK_EQ(A.Size(0), A.Size(1));
  integer N = A.Size(1), LWORK = -1, INFO;
  e.SetSize(A.Size(0));
  zMatrix<double> WORK(1);
  dsyev_("V", "U", &N, A.Data(), &N, e.Data(), WORK.Data(), &LWORK, &INFO);
  LWORK = (integer)(WORK[0]+0.5);
  WORK.SetSize(LWORK);
  dsyev_("V", "U", &N, A.Data(), &N, e.Data(), WORK.Data(), &LWORK, &INFO);
  if (info!=NULL) *info=INFO;
  return INFO==0;
}

bool EigRight(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &eigenVector, zMatrix<double,zColMajor> &eigenValueReal, zMatrix<double,zColMajor> &eigenValueImag, integer *info)
{
  /*
  *  JOBVL   (input) CHARACTER*1
  *          = 'N': left eigenvectors of A are not computed;
  *          = 'V': left eigenvectors of A are computed.
  *
  *  JOBVR   (input) CHARACTER*1
  *          = 'N': right eigenvectors of A are not computed;
  *          = 'V': right eigenvectors of A are computed.
  *
  *  N       (input) INTEGER
  *          The order of the matrix A. N >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the N-by-N matrix A.
  *          On exit, A has been overwritten.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,N).
  *
  *  WR      (output) DOUBLE PRECISION array, dimension (N)
  *  WI      (output) DOUBLE PRECISION array, dimension (N)
  *          WR and WI contain the real and imaginary parts,
  *          respectively, of the computed eigenvalues.  Complex
  *          conjugate pairs of eigenvalues appear consecutively
  *          with the eigenvalue having the positive imaginary part
  *          first.
  *
  *  VL      (output) DOUBLE PRECISION array, dimension (LDVL,N)
  *          If JOBVL = 'V', the left eigenvectors u(j) are stored one
  *          after another in the columns of VL, in the same order
  *          as their eigenvalues.
  *          If JOBVL = 'N', VL is not referenced.
  *          If the j-th eigenvalue is real, then u(j) = VL(:,j),
  *          the j-th column of VL.
  *          If the j-th and (j+1)-st eigenvalues form a complex
  *          conjugate pair, then u(j) = VL(:,j) + i*VL(:,j+1) and
  *          u(j+1) = VL(:,j) - i*VL(:,j+1).
  *
  *  LDVL    (input) INTEGER
  *          The leading dimension of the array VL.  LDVL >= 1; if
  *          JOBVL = 'V', LDVL >= N.
  *
  *  VR      (output) DOUBLE PRECISION array, dimension (LDVR,N)
  *          If JOBVR = 'V', the right eigenvectors v(j) are stored one
  *          after another in the columns of VR, in the same order
  *          as their eigenvalues.
  *          If JOBVR = 'N', VR is not referenced.
  *          If the j-th eigenvalue is real, then v(j) = VR(:,j),
  *          the j-th column of VR.
  *          If the j-th and (j+1)-st eigenvalues form a complex
  *          conjugate pair, then v(j) = VR(:,j) + i*VR(:,j+1) and
  *          v(j+1) = VR(:,j) - i*VR(:,j+1).
  *
  *  LDVR    (input) INTEGER
  *          The leading dimension of the array VR.  LDVR >= 1; if
  *          JOBVR = 'V', LDVR >= N.
  *
  *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
  *
  *  LWORK   (input) INTEGER
  *          The dimension of the array WORK.  LWORK >= max(1,3*N), and
  *          if JOBVL = 'V' or JOBVR = 'V', LWORK >= 4*N.  For good
  *          performance, LWORK must generally be larger.
  *
  *          If LWORK = -1, then a workspace query is assumed; the routine
  *          only calculates the optimal size of the WORK array, returns
  *          this value as the first entry of the WORK array, and no error
  *          message related to LWORK is issued by XERBLA.
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value.
  *          > 0:  if INFO = i, the QR algorithm failed to compute all the
  *                eigenvalues, and no eigenvectors have been computed;
  *                elements i+1:N of WR and WI contain eigenvalues which
  *                have converged.
  */
  ZCHECK_EQ(A.Size(0), A.Size(1));
  char JOBVL = 'N';  /* Don't compute left eigenvectors */
  char JOBVR = 'V';  /* Do compute right eigenvectors */
  integer N = A.Size(1);
  integer LDA = N;
  eigenValueReal.SetSize(N,1);
  eigenValueImag.SetSize(N,1);
  double *VL = NULL;
  integer LDVL = 1;
  eigenVector.SetSize(N,N);
  integer LDVR = N;
  integer LWORK ; //query first
  zMatrix<double,zColMajor> WORK(1,1);
  integer INFO;

  /* Query dgeev for the optimal value of lwork */
  LWORK = -1;
  dgeev_(&JOBVL, &JOBVR, &N, A.Data(), &LDA, eigenValueReal.Data(), eigenValueImag.Data(), VL, &LDVL, eigenVector.Data(), &LDVR, WORK.Data(), &LWORK, &INFO);
  LWORK = (int) WORK(0,0);
  WORK.SetSize(LWORK,1);

  /* Make the call to dgeev */
  dgeev_(&JOBVL, &JOBVR, &N, A.Data(), &LDA, eigenValueReal.Data(), eigenValueImag.Data(), VL, &LDVL, eigenVector.Data(), &LDVR, WORK.Data(), &LWORK, &INFO);

  if (info!=NULL) *info=INFO;
  return INFO==0;
}

bool SolveLinearLeastSquare(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &B_X, integer *info)
{
  /*
  *  DGELSS computes the minimum norm solution to a real linear least
  *  squares problem:
  *
  *  Minimize 2-norm(| b - A*x |).
  *
  *  using the singular value decomposition (SVD) of A. A is an M-by-N
  *  matrix which may be rank-deficient.
  *
  *  Several right hand side vectors b and solution vectors x can be
  *  handled in a single call; they are stored as the columns of the
  *  M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
  *  X.
  *
  *  The effective rank of A is determined by treating as zero those
  *  singular values which are less than RCOND times the largest singular
  *  value.
  *
  *  Arguments
  *  =========
  *
  *  M       (input) INTEGER
  *          The number of rows of the matrix A. M >= 0.
  *
  *  N       (input) INTEGER
  *          The number of columns of the matrix A. N >= 0.
  *
  *  NRHS    (input) INTEGER
  *          The number of right hand sides, i.e., the number of columns
  *          of the matrices B and X. NRHS >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the M-by-N matrix A.
  *          On exit, the first min(m,n) rows of A are overwritten with
  *          its right singular vectors, stored rowwise.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,M).
  *
  *  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
  *          On entry, the M-by-NRHS right hand side matrix B.
  *          On exit, B is overwritten by the N-by-NRHS solution
  *          matrix X.  If m >= n and RANK = n, the residual
  *          sum-of-squares for the solution in the i-th column is given
  *          by the sum of squares of elements n+1:m in that column.
  *
  *  LDB     (input) INTEGER
  *          The leading dimension of the array B. LDB >= max(1,max(M,N)).
  *
  *  S       (output) DOUBLE PRECISION array, dimension (min(M,N))
  *          The singular values of A in decreasing order.
  *          The condition number of A in the 2-norm = S(1)/S(min(m,n)).
  *
  *  RCOND   (input) DOUBLE PRECISION
  *          RCOND is used to determine the effective rank of A.
  *          Singular values S(i) <= RCOND*S(1) are treated as zero.
  *          If RCOND < 0, machine precision is used instead.
  *
  *  RANK    (output) INTEGER
  *          The effective rank of A, i.e., the number of singular values
  *          which are greater than RCOND*S(1).
  *
  *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
  *
  *  LWORK   (input) INTEGER
  *          The dimension of the array WORK. LWORK >= 1, and also:
  *          LWORK >= 3*min(M,N) + max(2*min(M,N), max(M,N), NRHS)
  *          For good performance, LWORK should generally be larger.
  *
  *          If LWORK = -1, then a workspace query is assumed; the routine
  *          only calculates the optimal size of the WORK array, returns
  *          this value as the first entry of the WORK array, and no error
  *          message related to LWORK is issued by XERBLA.
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value.
  *          > 0:  the algorithm for computing the SVD failed to converge;
  *                if INFO = i, i off-diagonal elements of an intermediate
  *                bidiagonal form did not converge to zero.
  */
  integer M = A.Size(0), N = A.Size(1);
  integer NRHS = B_X.Size(1);
  integer LDA = M, LDB = M;
  zMatrix<double,zColMajor> S(N,1);
  double RCOND = -1.0;
  integer RANK; /* Output */
  integer LWORK = 16 * (3 * min(M,N) + max(max(2 * min(M,N), max(M,N)), NRHS));
  zMatrix<double,zColMajor> WORK(LWORK,1);
  integer INFO;

  /* Make the FORTRAN call */
  dgelss_(&M, &N, &NRHS, A.Data(), &LDA, B_X.Data(), &LDB, S.Data(), &RCOND, &RANK, WORK.Data(), &LWORK, &INFO);

  if (info!=NULL) *info=INFO;
  return INFO==0;
}

bool SolveLinearLeastSquare2(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &B_X, integer *info)
{
  /*
  *  DGELSY computes the minimum-norm solution to a real linear least
  *  squares problem:
  *      minimize || A * X - B ||
  *  using a complete orthogonal factorization of A.  A is an M-by-N
  *  matrix which may be rank-deficient.
  *
  *  Several right hand side vectors b and solution vectors x can be
  *  handled in a single call; they are stored as the columns of the
  *  M-by-NRHS right hand side matrix B and the N-by-NRHS solution
  *  matrix X.
  *
  *  The routine first computes a QR factorization with column pivoting:
  *      A * P = Q * [ R11 R12 ]
  *                  [  0  R22 ]
  *  with R11 defined as the largest leading submatrix whose estimated
  *  condition number is less than 1/RCOND.  The order of R11, RANK,
  *  is the effective rank of A.
  *
  *  Then, R22 is considered to be negligible, and R12 is annihilated
  *  by orthogonal transformations from the right, arriving at the
  *  complete orthogonal factorization:
  *     A * P = Q * [ T11 0 ] * Z
  *                 [  0  0 ]
  *  The minimum-norm solution is then
  *     X = P * Z' [ inv(T11)*Q1'*B ]
  *                [        0       ]
  *  where Q1 consists of the first RANK columns of Q.
  *
  *  This routine is basically identical to the original xGELSX except
  *  three differences:
  *    o The call to the subroutine xGEQPF has been substituted by the
  *      the call to the subroutine xGEQP3. This subroutine is a Blas-3
  *      version of the QR factorization with column pivoting.
  *    o Matrix B (the right hand side) is updated with Blas-3.
  *    o The permutation of matrix B (the right hand side) is faster and
  *      more simple.
  *
  *  Arguments
  *  =========
  *
  *  M       (input) INTEGER
  *          The number of rows of the matrix A.  M >= 0.
  *
  *  N       (input) INTEGER
  *          The number of columns of the matrix A.  N >= 0.
  *
  *  NRHS    (input) INTEGER
  *          The number of right hand sides, i.e., the number of
  *          columns of matrices B and X. NRHS >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the M-by-N matrix A.
  *          On exit, A has been overwritten by details of its
  *          complete orthogonal factorization.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,M).
  *
  *  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
  *          On entry, the M-by-NRHS right hand side matrix B.
  *          On exit, the N-by-NRHS solution matrix X.
  *
  *  LDB     (input) INTEGER
  *          The leading dimension of the array B. LDB >= max(1,M,N).
  *
  *  JPVT    (input/output) INTEGER array, dimension (N)
  *          On entry, if JPVT(i) .ne. 0, the i-th column of A is permuted
  *          to the front of AP, otherwise column i is a free column.
  *          On exit, if JPVT(i) = k, then the i-th column of AP
  *          was the k-th column of A.
  *
  *  RCOND   (input) DOUBLE PRECISION
  *          RCOND is used to determine the effective rank of A, which
  *          is defined as the order of the largest leading triangular
  *          submatrix R11 in the QR factorization with pivoting of A,
  *          whose estimated condition number < 1/RCOND.
  *
  *  RANK    (output) INTEGER
  *          The effective rank of A, i.e., the order of the submatrix
  *          R11.  This is the same as the order of the submatrix T11
  *          in the complete orthogonal factorization of A.
  *
  *  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
  *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
  *
  *  LWORK   (input) INTEGER
  *          The dimension of the array WORK.
  *          The unblocked strategy requires that:
  *             LWORK >= MAX(MN+3*N+1, 2*MN+NRHS),
  *          where MN = min(M, N).
  *          The block algorithm requires that:
  *             LWORK >= MAX(MN+2*N+NB*(N+1), 2*MN+NB*NRHS),
  *          where NB is an upper bound on the blocksize returned
  *          by ILAENV for the routines DGEQP3, DTZRZF, STZRQF, DORMQR,
  *          and DORMRZ.
  *
  *          If LWORK = -1, then a workspace query is assumed; the routine
  *          only calculates the optimal size of the WORK array, returns
  *          this value as the first entry of the WORK array, and no error
  *          message related to LWORK is issued by XERBLA.
  *
  *  INFO    (output) INTEGER
  *          = 0: successful exit
  *          < 0: If INFO = -i, the i-th argument had an illegal value.
  */
  integer M = A.Size(0), N = A.Size(1);
  integer NRHS = B_X.Size(1);
  integer LDA = M, LDB = M;
  zMatrix<integer,zColMajor> JPVT(Zeros<integer>(N,1));
  double RCOND = -1.0;
  integer RANK; /* Output */
  integer LWORK;
  zMatrix<double,zColMajor> WORK(1,1);
  integer INFO;

  //query first
  LWORK = -1;
  dgelsy_(&M, &N, &NRHS, A.Data(), &LDA, B_X.Data(), &LDB, JPVT.Data(), &RCOND, &RANK, WORK.Data(), &LWORK, &INFO);

  //real call
  LWORK = WORK(0,0);
  WORK.SetSize(LWORK,1);
  dgelsy_(&M, &N, &NRHS, A.Data(), &LDA, B_X.Data(), &LDB, JPVT.Data(), &RCOND, &RANK, WORK.Data(), &LWORK, &INFO);


  if (info!=NULL) *info=INFO;
  return INFO==0;
}

bool SolveLinear(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &B_X, long *info)
{
  /*
  *  DGESV computes the solution to a real system of linear equations
  *     A * X = B,
  *  where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
  *
  *  The LU decomposition with partial pivoting and row interchanges is
  *  used to factor A as
  *     A = P * L * U,
  *  where P is a permutation matrix, L is unit lower triangular, and U is
  *  upper triangular.  The factored form of A is then used to solve the
  *  system of equations A * X = B.
  *
  *  Arguments
  *  =========
  *
  *  N       (input) INTEGER
  *          The number of linear equations, i.e., the order of the
  *          matrix A.  N >= 0.
  *
  *  NRHS    (input) INTEGER
  *          The number of right hand sides, i.e., the number of columns
  *          of the matrix B.  NRHS >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the N-by-N coefficient matrix A.
  *          On exit, the factors L and U from the factorization
  *          A = P*L*U; the unit diagonal elements of L are not stored.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,N).
  *
  *  IPIV    (output) INTEGER array, dimension (N)
  *          The pivot indices that define the permutation matrix P;
  *          row i of the matrix was interchanged with row IPIV(i).
  *
  *  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS)
  *          On entry, the N-by-NRHS matrix of right hand side matrix B.
  *          On exit, if INFO = 0, the N-by-NRHS solution matrix X.
  *
  *  LDB     (input) INTEGER
  *          The leading dimension of the array B.  LDB >= max(1,N).
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value
  *          > 0:  if INFO = i, U(i,i) is exactly zero.  The factorization
  *                has been completed, but the factor U is exactly
  *                singular, so the solution could not be computed.
  */
  ZCHECK_EQ(A.Size(0), A.Size(1));
  integer N=A.Size(0);
  integer NRHS=B_X.Size(1);
  integer LDA=N,LDB=N,INFO;
  zMatrix<integer,zColMajor> IPIV(N,1);
  dgesv_(&N, &NRHS, A.Data(), &LDA, IPIV.Data(), B_X.Data(), &LDB, &INFO);
  if (info!=NULL) *info=INFO;
  return INFO==0;
}

bool CholeskyFactorization(zMatrix<double,zColMajor> &A, long *info)
{
  /*
  *  DPOTRF computes the Cholesky factorization of a real symmetric
  *  positive definite matrix A.
  *
  *  The factorization has the form
  *     A = U**T * U,  if UPLO = 'U', or
  *     A = L  * L**T,  if UPLO = 'L',
  *  where U is an upper triangular matrix and L is lower triangular.
  *
  *  This is the block version of the algorithm, calling Level 3 BLAS.
  *
  *  Arguments
  *  =========
  *
  *  UPLO    (input) CHARACTER*1
  *          = 'U':  Upper triangle of A is stored;
  *          = 'L':  Lower triangle of A is stored.
  *
  *  N       (input) INTEGER
  *          The order of the matrix A.  N >= 0.
  *
  *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
  *          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
  *          N-by-N upper triangular part of A contains the upper
  *          triangular part of the matrix A, and the strictly lower
  *          triangular part of A is not referenced.  If UPLO = 'L', the
  *          leading N-by-N lower triangular part of A contains the lower
  *          triangular part of the matrix A, and the strictly upper
  *          triangular part of A is not referenced.
  *
  *          On exit, if INFO = 0, the factor U or L from the Cholesky
  *          factorization A = U**T*U or A = L*L**T.
  *
  *  LDA     (input) INTEGER
  *          The leading dimension of the array A.  LDA >= max(1,N).
  *
  *  INFO    (output) INTEGER
  *          = 0:  successful exit
  *          < 0:  if INFO = -i, the i-th argument had an illegal value
  *          > 0:  if INFO = i, the leading minor of order i is not
  *                positive definite, and the factorization could not be
  *                completed.
  *
  */
  ZCHECK_EQ(A.Size(0), A.Size(1));
  integer N=A.Size(0), LDA=N, INFO;
  char UPLO = 'U';

  /* Call lapack routine */
  dpotrf_(&UPLO, &N, A.Data(), &LDA, &INFO);

  if (info!=NULL) *info=INFO;
  return INFO==0;
}
#else
bool LUFactorization(zMatrix<double,zColMajor> &mat, zMatrix<long,zColMajor> &IPIV, long *info) {
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool CholeskyFactorization(zMatrix<double,zColMajor> &A, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool Invert(zMatrix<double,zColMajor> &mat, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool SVD(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &U, zMatrix<double,zColMajor> &S, zMatrix<double,zColMajor> &VT, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool EigenSym(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &e, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
double Det(zMatrix<double,zColMajor> &A){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool EigRight(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &eigenVector, zMatrix<double,zColMajor> &eigenValueReal, zMatrix<double,zColMajor> &eigenValueImag, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool SolveLinearLeastSquare(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &B_X, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool SolveLinearLeastSquare2(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &B_X, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
bool SolveLinear(zMatrix<double,zColMajor> &A, zMatrix<double,zColMajor> &B_X, long *info){
  ZLOGE<<"Not implemented due to the lack of the Lapack\n";
  return false;
}
#endif
}
